In [1]:
# Computations
import pandas as pd
import numpy as np

# Visualisation libraries

## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex

## seaborn
import seaborn as sns
sns.set_context('paper', rc={'font.size':12,'axes.titlesize':14,'axes.labelsize':12})
sns.set_style('white')

## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
import matplotlib.colors
from pylab import rcParams
plt.style.use('seaborn-whitegrid')
plt.rcParams['figure.figsize'] = 14, 8
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline

## plotly
from plotly.offline import init_notebook_mode, iplot 
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
from plotly import tools
import plotly.express as px
import plotly.figure_factory as ff
# Graphics in retina format 
%config InlineBackend.figure_format = 'retina' 

import warnings
warnings.filterwarnings('ignore')

In this article, we analyze the UCI Statlog (german credit data) from Kaggle.com.

Context

The original dataset contains 1000 entries with 20 categorial/symbolic attributes prepared by Prof. Hofmann. In this dataset, each entry represents a person who takes a credit by a bank. Each person is classified as good or bad credit risks according to the set of attributes. The link to the original dataset can be found below.

Content

It is almost impossible to understand the original dataset due to its complicated system of categories and symbols. Thus, I wrote a small Python script to convert it into a readable CSV file. Several columns are simply ignored, because in my opinion either they are not important or their descriptions are obscure. The selected attributes are:

  • Age (numeric)
  • Sex (text: male, female)
  • Job (numeric: 0 - unskilled and non-resident, 1 - unskilled and resident, 2 - skilled, 3 - highly skilled)
  • Housing (text: own, rent, or free)
  • Saving accounts (text - little, moderate, quite rich, rich)
  • Checking account (numeric, in DM - Deutsch Mark)
  • Credit amount (numeric, in DM)
  • Duration (numeric, in month)
  • Purpose (text: car, furniture/equipment, radio/TV, domestic appliances, repairs, education, business, vacation/others)
In [2]:
Data = pd.read_csv('Data/german_credit_data.csv', index_col=0)

def Data_info(Inp, Only_NaN = False):
    Out = Inp.dtypes.to_frame(name='Data Type').sort_values(by=['Data Type'])
    Out = Out.join(Inp.isnull().sum().to_frame(name = 'Number of NaN Values'), how='outer')
    Out['Percentage'] = np.round(100*(Out['Number of NaN Values']/Inp.shape[0]),2)
    if Only_NaN:
        Out = Out.loc[Out['Number of NaN Values']>0]
    return Out
#
print(Back.BLACK + Fore.CYAN + Style.NORMAL + 'The Dataset:')
display(Data.head())
print(Back.BLACK + Fore.CYAN + Style.NORMAL + 'Nan Values:')
display(Data_info(Data))
print(Back.BLACK + Fore.CYAN + Style.NORMAL + 'Dataset Shape:')
display(pd.DataFrame([Data.shape], columns = ['Instances','Attributes'],index = ['Dataset']))
The Dataset:
Age Sex Job Housing Saving accounts Checking account Credit amount Duration Purpose Risk
0 67 male 2 own NaN little 1169 6 radio/TV good
1 22 female 2 own little moderate 5951 48 radio/TV bad
2 49 male 1 own little NaN 2096 12 education good
3 45 male 2 free little little 7882 42 furniture/equipment good
4 53 male 2 free little little 4870 24 car bad
Nan Values:
Data Type Number of NaN Values Percentage
Age int64 0 0.0
Checking account object 394 39.4
Credit amount int64 0 0.0
Duration int64 0 0.0
Housing object 0 0.0
Job int64 0 0.0
Purpose object 0 0.0
Risk object 0 0.0
Saving accounts object 183 18.3
Sex object 0 0.0
Dataset Shape:
Instances Attributes
Dataset 1000 10

Preprocessing

In [3]:
Data['Sex'] = Data['Sex'].map(lambda x: x.title())
Data['Housing'] = Data['Housing'].map(lambda x: x.title())
Data['Checking account'] = Data['Checking account'].fillna('None')
Data['Checking account'] = Data['Checking account'].map(lambda x: x.title())
Data['Saving accounts'] = Data['Saving accounts'].fillna('None')
Data['Saving accounts'] = Data['Saving accounts'].map(lambda x: x.title())
Data['Purpose'] = Data['Purpose'].map(lambda x: x.title())
Data['Purpose'] = Data['Purpose'].replace({'Radio/Tv':'Radio/TV'})
Data['Risk'] = Data['Risk'].map(lambda x: x.title())
Data.columns = [x.title() for x in Data.columns]
Data.head()
Out[3]:
Age Sex Job Housing Saving Accounts Checking Account Credit Amount Duration Purpose Risk
0 67 Male 2 Own None Little 1169 6 Radio/TV Good
1 22 Female 2 Own Little Moderate 5951 48 Radio/TV Bad
2 49 Male 1 Own Little None 2096 12 Education Good
3 45 Male 2 Free Little Little 7882 42 Furniture/Equipment Good
4 53 Male 2 Free Little Little 4870 24 Car Bad

Exploratory Data Analysis

Risk Distribution

Let's see how the Risk feature is distributed across the dataset.

In [4]:
# Good and Bad Colormap
GB_Colors = ['LightCoral', 'LimeGreen']
GB_LC = 'Black'
# Male and Female Colormap
MF_Colors = ['HotPink', 'RoyalBlue']
MF_LC = 'Navy'
#
Temp = Data.groupby(['Risk'])['Risk'].agg({'count'}).rename(columns = {'count':'Count'}).reset_index(drop = False)
Temp['Percentage'] = np.round(100* Temp['Count'].values /Temp['Count'].sum(), 2)
display(Temp.style.hide_index().set_precision(2))


fig = go.Figure(data=[go.Bar(x = Temp['Percentage'], y = Temp['Risk'], text = Temp['Percentage'],
                             marker_color= GB_Colors, orientation='h')])
fig.update_traces(marker_line_color= GB_LC, marker_line_width=1, opacity=1)
fig.update_traces(texttemplate='%{text:.2}%', textposition='inside')
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide')
fig['layout']['xaxis'].update(range=[0, 80])
fig.update_layout(plot_bgcolor= 'white', height= 260)
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_layout(title={'text': 'Risk Distribution',
                         'x':0.46, 'y':0.75,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.show()
Risk Count Percentage
Bad 300 30.00
Good 700 70.00

Credit Distribution Histogram

In [5]:
fig = px.histogram(Data, x = 'Credit Amount', color='Risk', marginal= 'box', color_discrete_sequence= GB_Colors,
                  hover_data=['Credit Amount', 'Risk'])
fig.update_layout(title = 'Credit Distribution Histogram', plot_bgcolor= 'white')
fig.update_traces(marker_line_color= GB_LC, marker_line_width=0.5, opacity=1)
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.show()

Gender Distribution by Risk

In [6]:
Temp = Data.groupby(['Sex','Risk'])['Risk'].agg({'count'}).rename(columns = {'count':'Count'})
Temp['Percentage'] = np.round(100* Temp.values /Temp.sum().values, 2)
display(Temp)

Temp.reset_index(drop = False, inplace = True)
fig = px.bar(Temp, y= 'Sex', x= 'Percentage', orientation='h',
             color = 'Risk', text = 'Percentage', color_discrete_sequence= GB_Colors, height= 220)
fig.update_traces(marker_line_color= GB_LC, marker_line_width=1, opacity=1)
fig.update_traces(texttemplate='%{text:.2}%', textposition='inside')
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide')
fig['layout']['xaxis'].update(range=[0, 80])
fig.update_layout(plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_layout(title={'text': 'Gender Distribution by Risk',
                         'x':0.46, 'y':0.95,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.show()
Count Percentage
Sex Risk
Female Bad 109 10.9
Good 201 20.1
Male Bad 191 19.1
Good 499 49.9

Age Distribution by Risk

In [7]:
Temp = Data.groupby(['Age','Risk'])['Age'].agg({'count'}).rename(columns = {'count':'Count'})
Temp['Percentage'] = np.round(100* Temp.values /Temp.sum().values, 2)
Temp.reset_index(drop = False, inplace = True)
Temp.loc[Temp.Risk == 'Bad', 'Percentage'] = -Temp.loc[Temp.Risk == 'Bad', 'Percentage']

fig = px.bar(Temp, x= 'Age', y= 'Percentage', color = 'Risk', hover_data= ['Risk', 'Count'],
             color_discrete_sequence= GB_Colors, height= 500)

fig.update_traces(marker_line_color= GB_LC, marker_line_width=0.5, opacity=1)
fig['layout']['yaxis'].update(range=[-3, 4])
fig.update_layout(yaxis = dict(tickmode = 'array', tickvals = np.arange(-4,5), ticktext = np.abs(np.arange(-4,5))))
fig.update_traces(marker_line_color= 'Navy', marker_line_width=1.2, opacity=1)
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_layout(legend_orientation='h', plot_bgcolor= 'white')
fig.update_layout(title={'text': 'Age Distribution by Risk',
                         'x':0.5, 'y':0.95,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.show()

Age Distribution by Gender and Risk

In [8]:
Temp = Data[['Age','Risk','Sex']]
fig = px.box(Data, x='Sex', y='Age', color='Risk', color_discrete_sequence= GB_Colors[::-1])
fig.update_traces(quartilemethod='linear')
fig.update_layout(height= 500, width = 600)
fig.update_layout(plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True, range=[10, 80])
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_layout(title={'text': 'Age Distribution by Gender and Risk',
                         'x':0.47, 'y':0.95,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.show()

Age Group and Age Category

Creating new features:

  • Age Group
  • Age Category

We can create Age Categories using statcan.gc.ca.

Interval Age Category
00-14 years Children
15-24 years Youth
25-64 years Adults
65 years and over Seniors
In [9]:
if Data.Age.min() < 14:
    bins = pd.IntervalIndex.from_tuples([(0, 14), (14, 24), (24, 64),(64, Data.Age.max())])
else:
    bins = pd.IntervalIndex.from_tuples([(14, 24), (24, 64),(64, Data.Age.max())])
Data['Age Group'] = pd.cut(Data['Age'], bins)
Data['Age Category'] = Data['Age Group'].astype(str).replace({'(14, 24]':'Youth', '(24, 64]':'Adults','(64, 75]':'Seniors'})
Data.head()
Out[9]:
Age Sex Job Housing Saving Accounts Checking Account Credit Amount Duration Purpose Risk Age Group Age Category
0 67 Male 2 Own None Little 1169 6 Radio/TV Good (64, 75] Seniors
1 22 Female 2 Own Little Moderate 5951 48 Radio/TV Bad (14, 24] Youth
2 49 Male 1 Own Little None 2096 12 Education Good (24, 64] Adults
3 45 Male 2 Free Little Little 7882 42 Furniture/Equipment Good (24, 64] Adults
4 53 Male 2 Free Little Little 4870 24 Car Bad (24, 64] Adults

Age Group Distribution by Gender and Risk

In [10]:
Temp = Data.groupby(['Sex','Age Group','Age Category','Risk'])['Risk'].agg({'count'}).rename(columns = {'count':'Count'})
Temp['Percentage'] = np.round(100* Temp.values /Temp.sum().values, 2)
Temp = Temp[(Temp.T != 0).any()]
display(Temp)
Temp.reset_index(drop = False, inplace = True)
Temp = Temp.sort_values(by=['Age Group'])
# Figures

fig = make_subplots(rows=2, cols=1, vertical_spacing = 0.05, shared_xaxes=True, subplot_titles=('Risk: Good', 'Risk: Bad'))
# Top
fig1 = px.bar(Temp.loc[Temp.Risk == 'Good'], y= 'Age Category', x= 'Percentage', orientation='h',
              color = 'Sex', text = 'Percentage', hover_data= Temp.columns,
              color_discrete_sequence = MF_Colors, height= 400)
fig.add_trace(fig1['data'][0], row=1, col=1)
fig.add_trace(fig1['data'][1], row=1, col=1)
fig.update_traces(marker_line_color= MF_LC, marker_line_width=1, opacity=1, row=1, col=1)

# Bottom
fig2 = px.bar(Temp.loc[Temp.Risk == 'Bad'], y= 'Age Category', x= 'Percentage', orientation='h',
              color = 'Sex', text = 'Percentage', hover_data= Temp.columns,
              color_discrete_sequence = MF_Colors, height= 400)

fig.add_trace(fig2['data'][0], row=2, col=1)
fig.add_trace(fig2['data'][1], row=2, col=1)
fig.update_traces(marker_line_color= MF_LC, marker_line_width=1, opacity=1, showlegend = False, row=2, col=1)

# Update

fig.update_layout(height= 600)
fig.update_layout(plot_bgcolor= 'white', legend_orientation='h')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_traces(texttemplate='%{text:.2}%', textposition='outside')
fig.update_xaxes(title_text='Percent', range=[0, 50], row=2, col=1)
fig.update_yaxes(title_text='Age Group', row=1, col=1)
fig.update_yaxes(title_text='Age Group', row=2, col=1)
fig.update_layout(title={'text': 'Age Group Distribution by Gender and Risk',
                         'x':0.50, 'y':0.92,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.show()
Count Percentage
Sex Age Group Age Category Risk
Female (14, 24] Youth Bad 36 3.6
Good 48 4.8
(24, 64] Adults Bad 72 7.2
Good 148 14.8
(64, 75] Seniors Bad 1 0.1
Good 5 0.5
Male (14, 24] Youth Bad 25 2.5
Good 40 4.0
(24, 64] Adults Bad 161 16.1
Good 447 44.7
(64, 75] Seniors Bad 5 0.5
Good 12 1.2

Credit Amount Distribution by Age Group and Risk

A box plot is a statistical representation of numerical data through their quartiles. The ends of the box represent the lower and upper quartiles, while the median (second quartile) is marked by a line inside the box.

In [11]:
fig = px.box(Data, x='Age Category', y='Credit Amount', color='Risk', color_discrete_sequence= GB_Colors[::-1])
fig.update_traces(quartilemethod='linear')
fig.update_layout(height= 500, width = 600)
fig.update_layout(plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True, range=[0, 20e3])
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_layout(title={'text': 'Credit Amount Distribution by Age Group and Risk',
                         'x':0.48, 'y':0.95,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.show()

Job Distribution by Risk

In [12]:
Temp = Data[['Risk','Job']]
Temp['Job'] = Temp['Job'].map({0: 'Unskilled and Non-Resident',
                        1: 'Unskilled and Resident',
                        2: 'Skilled',
                        3: 'Highly Skilled'})
Temp = Temp.groupby(['Risk','Job'])['Job'].agg({'count'}).rename(columns = {'count':'Count'})
Temp['Percentage'] = np.round(100* Temp.values /Temp.sum().values, 2)
display(Temp)
Temp.reset_index(inplace = True, drop = False)

Temp.reset_index(drop = False, inplace = True)
fig = px.bar(Temp, y= 'Job', x= 'Percentage', orientation='h', hover_data= Temp.columns,
             color = 'Risk', text = 'Percentage', color_discrete_sequence= GB_Colors, height= 300)
fig.update_traces(marker_line_color= GB_LC, marker_line_width=1, opacity=1)
fig.update_traces(texttemplate='%{text:.2}%', textposition='inside')
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide')
fig['layout']['xaxis'].update(range=[0, 70])
fig.update_layout(legend_orientation='h', plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_layout(title={'text': 'Job Distribution by Risk',
                         'x':0.56, 'y':0.95,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.show()
Count Percentage
Risk Job
Bad Highly Skilled 51 5.1
Skilled 186 18.6
Unskilled and Non-Resident 7 0.7
Unskilled and Resident 56 5.6
Good Highly Skilled 97 9.7
Skilled 444 44.4
Unskilled and Non-Resident 15 1.5
Unskilled and Resident 144 14.4

Housing Distribution by Risk

In [13]:
Temp = Data.groupby(['Risk','Housing'])['Risk'].agg({'count'}).rename(columns = {'count':'Count'})
Temp['Percentage'] = np.round(100* Temp.values /Temp.sum().values, 2)
display(Temp)

Temp.reset_index(drop = False, inplace = True)
fig = px.bar(Temp, y= 'Housing', x= 'Percentage', orientation='h', hover_data= Temp.columns,
             color = 'Risk', text = 'Percentage', color_discrete_sequence= GB_Colors, height= 280)
fig.update_traces(marker_line_color= GB_LC, marker_line_width=1, opacity=1)
fig.update_traces(texttemplate='%{text:.2}%', textposition='inside')
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide')
fig['layout']['xaxis'].update(range=[0, 80])
fig.update_layout(plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_layout(title={'text': 'Housing Distribution by Risk',
                         'x':0.5, 'y':0.9,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.show()
Count Percentage
Risk Housing
Bad Free 44 4.4
Own 186 18.6
Rent 70 7.0
Good Free 64 6.4
Own 527 52.7
Rent 109 10.9

Credit Amount Distribution by Housing and Risk

A violin plot is a statistical representation of numerical data. It is similar to a box plot, with the addition of a rotated kernel density plot on each side. The ends of the box represent the lower and upper quartiles, while the median (second quartile) is marked by a line inside the box. Moreover, the data distribution in each case can be seen as well.

In [14]:
fig = px.violin(Data, x='Housing', y='Credit Amount', color='Risk', box=True,
          hover_data=['Housing','Credit Amount','Risk'], color_discrete_sequence= GB_Colors[::-1])
fig.update_layout(plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True, range=[-5e3, 25e3])
fig.update_yaxes(zeroline=True, zerolinewidth=1, zerolinecolor='Lightgray')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_layout(title={'text': 'Credit Amount Distribution by Housing and Risk',
                         'x':0.46, 'y':0.95,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.show()

Credit Amount Distribution by Job and Risk

In [15]:
Temp = Data[['Risk','Job','Credit Amount']]
Temp['Job'] = Temp['Job'].map({0: 'Unskilled and Non-Resident',
                        1: 'Unskilled and Resident',
                        2: 'Skilled',
                        3: 'Highly Skilled'})
fig = px.violin(Temp, x='Job', y='Credit Amount', color='Risk', box=True,
          hover_data=['Job','Credit Amount','Risk'], color_discrete_sequence= GB_Colors[::-1])
fig.update_layout(plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True, range=[-5e3, 25e3])
fig.update_yaxes(zeroline=True, zerolinewidth=1, zerolinecolor='Lightgray')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_layout(title={'text': 'Credit Amount Distribution by Job and Risk',
                         'x':0.46, 'y':0.95,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.show()

Credit Amount by Gender and Risk

In [16]:
fig = px.box(Data, x='Sex', y='Credit Amount', color='Risk', color_discrete_sequence= GB_Colors[::-1])
fig.update_traces(quartilemethod='linear')
fig.update_layout(height= 500, width = 500, plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True, range=[0, 20e3])
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_layout(title={'text': 'Credit Amount Distribution by Gender',
                         'x':0.48, 'y':0.95,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.show()

Credit Amount Distribution by Housing and Risk

In [17]:
Feat = 'Housing'
Temp = Data[[Feat,'Risk','Credit Amount']]
fig = make_subplots(rows=1, cols=2, shared_xaxes=True,
                    subplot_titles=('Credit Distribution by %s and Risk' % Feat,
                                    'Average Credit by %s and Risk' % Feat))
# Left
fig1 = px.box(Temp, x=Feat, y='Credit Amount', color='Risk', color_discrete_sequence= GB_Colors[::-1])
fig1.update_traces(quartilemethod='linear')
fig.add_trace(fig1['data'][0], row=1, col=1)
fig.add_trace(fig1['data'][1], row=1, col=1)
fig.update_traces(marker_line_color= GB_LC, marker_line_width=.2, opacity=1, showlegend = False, row=1, col=1)

# Right
Temp = Temp.groupby([Feat,'Risk'])['Credit Amount'].agg({'mean'}).rename(columns = {'mean':'Average'}).round(2)
display(Temp)
Temp.reset_index(drop = False, inplace = True)

fig2 = px.bar(Temp, x= Feat, y= 'Average', orientation='v',
             color = 'Risk', text = 'Average', barmode='group', color_discrete_sequence= GB_Colors, height= 400)
fig2.update_traces(marker_line_color= GB_LC, marker_line_width=1, opacity=1)
fig.add_trace(fig2['data'][0], row=1, col=2)
fig.add_trace(fig2['data'][1], row=1, col=2)

# Updates
fig.update_layout(boxmode='group')
fig.update_yaxes(range=[0, 20e3], row=1, col=1)
fig.update_yaxes(title_text='Average', range=[0, 6e3], row=1, col=2)
fig.update_layout(plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_layout(title={'text': 'Credit Amount Distribution by Housing and Risk',
                         'x':0.46, 'y':0.9,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.show()
Average
Housing Risk
Free Bad 5536.45
Good 4472.92
Own Bad 3693.80
Good 2837.58
Rent Bad 3582.67
Good 2827.06

Credit Amount Distribution by Job and Risk

In [18]:
Feat = 'Job'
Temp = Data[[Feat,'Risk','Credit Amount']]
Temp['Job'] = Temp['Job'].map({0: 'Unskilled and Non-Resident',
                        1: 'Unskilled and Resident',
                        2: 'Skilled',
                        3: 'highly skilled'})
fig = make_subplots(rows=1, cols=2, shared_xaxes=True,
                    subplot_titles=('Credit Distribution by %s and Risk' % Feat,
                                    'Average Credit by %s and Risk' % Feat))
# Left
fig1 = px.box(Temp, x=Feat, y='Credit Amount', color='Risk', color_discrete_sequence= GB_Colors[::-1])
fig1.update_traces(quartilemethod='linear')
fig.add_trace(fig1['data'][0], row=1, col=1)
fig.add_trace(fig1['data'][1], row=1, col=1)
fig.update_traces(marker_line_color= GB_LC, marker_line_width=.2, opacity=1, showlegend = False, row=1, col=1)

# Right
Temp = Temp.groupby([Feat,'Risk'])['Credit Amount'].agg({'mean'}).rename(columns = {'mean':'Average'}).round(2)
display(Temp)
Temp.reset_index(drop = False, inplace = True)

fig2 = px.bar(Temp, x= Feat, y= 'Average', orientation='v',
             color = 'Risk', text = 'Average', barmode='group', color_discrete_sequence= GB_Colors, height= 400)
fig2.update_traces(marker_line_color= GB_LC, marker_line_width=1, opacity=1)
fig.add_trace(fig2['data'][0], row=1, col=2)
fig.add_trace(fig2['data'][1], row=1, col=2)

# Updates
fig.update_layout(boxmode='group')
fig.update_yaxes(range=[0, 20e3], row=1, col=1)
fig.update_yaxes(title_text='Average', range=[0, 7e3], row=1, col=2)
fig.update_layout(plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_layout(title={'text': 'Credit Amount Distribution by Job and Risk',
                         'x':0.46, 'y':0.9,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.show()
Average
Job Risk
Skilled Bad 3642.65
Good 2831.48
Unskilled and Non-Resident Bad 3221.14
Good 2523.00
Unskilled and Resident Bad 2387.55
Good 2347.23
highly skilled Bad 6816.75
Good 4709.27

Credit Amount Distribution by Saving Accounts and Risk

In [19]:
Feat = 'Saving Accounts'
Temp = Data[[Feat,'Risk','Credit Amount']]
fig = make_subplots(rows=1, cols=2, shared_xaxes=True,
                    subplot_titles=('Credit Distribution by %s and Risk' % Feat,
                                    'Average Credit by %s and Risk' % Feat))
# Left
fig1 = px.box(Temp, x=Feat, y='Credit Amount', color='Risk', color_discrete_sequence= GB_Colors[::-1])
fig1.update_traces(quartilemethod='linear')
fig.add_trace(fig1['data'][0], row=1, col=1)
fig.add_trace(fig1['data'][1], row=1, col=1)
fig.update_traces(marker_line_color= GB_LC, marker_line_width=.2, opacity=1, showlegend = False, row=1, col=1)

# Right
Temp = Temp.groupby([Feat,'Risk'])['Credit Amount'].agg({'mean'}).rename(columns = {'mean':'Average'}).round(2)
display(Temp)
Temp.reset_index(drop = False, inplace = True)

fig2 = px.bar(Temp, x= Feat, y= 'Average', orientation='v',
             color = 'Risk', text = 'Average', barmode='group', color_discrete_sequence= GB_Colors, height= 500)
fig2.update_traces(marker_line_color= GB_LC, marker_line_width=1, opacity=1)
fig.add_trace(fig2['data'][0], row=1, col=2)
fig.add_trace(fig2['data'][1], row=1, col=2)

# Updates
fig.update_layout(boxmode='group')
fig.update_yaxes(range=[0, 20e3], row=1, col=1)
fig.update_yaxes(title_text='Average', range=[0, 5e3], row=1, col=2)
fig.update_layout(plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_layout(title={'text': 'Credit Amount Distribution by Saving Accounts and Risk',
                         'x':0.46, 'y':0.9,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.show()
Average
Saving Accounts Risk
Little Bad 3925.56
Good 2773.10
Moderate Bad 4006.97
Good 3077.09
None Bad 4362.06
Good 3809.85
Quite Rich Bad 3052.82
Good 2470.42
Rich Bad 3364.67
Good 2460.36

Credit Amount Distribution by Checking Account and Risk

In [20]:
Feat = 'Checking Account'
Temp = Data[[Feat,'Risk','Credit Amount']]
fig = make_subplots(rows=1, cols=2, shared_xaxes=True,
                    subplot_titles=('Credit Distribution by %s and Risk' % Feat,
                                    'Average Credit by %s and Risk' % Feat))
# Left
fig1 = px.box(Temp, x=Feat, y='Credit Amount', color='Risk', color_discrete_sequence= GB_Colors[::-1])
fig1.update_traces(quartilemethod='linear')
fig.add_trace(fig1['data'][0], row=1, col=1)
fig.add_trace(fig1['data'][1], row=1, col=1)
fig.update_traces(marker_line_color= GB_LC, marker_line_width=.2, opacity=1, showlegend = False, row=1, col=1)

# Right
Temp = Temp.groupby([Feat,'Risk'])['Credit Amount'].agg({'mean'}).rename(columns = {'mean':'Average'}).round(2)
display(Temp)
Temp.reset_index(drop = False, inplace = True)

fig2 = px.bar(Temp, x= Feat, y= 'Average', orientation='v',
             color = 'Risk', text = 'Average', barmode='group', color_discrete_sequence= GB_Colors, height= 400)
fig2.update_traces(marker_line_color= GB_LC, marker_line_width=1, opacity=1)
fig.add_trace(fig2['data'][0], row=1, col=2)
fig.add_trace(fig2['data'][1], row=1, col=2)

# Updates
fig.update_layout(boxmode='group')
fig.update_yaxes(range=[0, 20e3], row=1, col=1)
fig.update_yaxes(title_text='Average', range=[0, 5e3], row=1, col=2)
fig.update_layout(plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_layout(title={'text': 'Credit Amount Distribution by Checking Account and Risk',
                         'x':0.46, 'y':0.9,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.show()
Average
Checking Account Risk
Little Bad 3413.61
Good 2943.69
Moderate Bad 4754.75
Good 3233.93
None Bad 4286.78
Good 2980.60
Rich Bad 1725.71
Good 2306.78

Credit Amount Distribution by Purpose and Risk

In [21]:
Feat = 'Purpose'
Temp = Data[[Feat,'Risk','Credit Amount']]
fig = make_subplots(rows=1, cols=2, shared_xaxes=True,
                    subplot_titles=('Credit Distribution by %s and Risk' % Feat,
                                    'Average Credit by %s and Risk' % Feat))
# Left
fig1 = px.box(Temp, x=Feat, y='Credit Amount', color='Risk', color_discrete_sequence= GB_Colors[::-1])
fig1.update_traces(quartilemethod='linear')
fig.add_trace(fig1['data'][0], row=1, col=1)
fig.add_trace(fig1['data'][1], row=1, col=1)
fig.update_traces(marker_line_color= GB_LC, marker_line_width=.2, opacity=1, showlegend = False, row=1, col=1)

# Right
Temp = Temp.groupby([Feat,'Risk'])['Credit Amount'].agg({'mean'}).rename(columns = {'mean':'Average'}).round(2)
display(Temp)
Temp.reset_index(drop = False, inplace = True)

fig2 = px.bar(Temp, x= Feat, y= 'Average', orientation='v',
             color = 'Risk', text = 'Average', barmode='group', color_discrete_sequence= GB_Colors, height= 350)
fig2.update_traces(marker_line_color= GB_LC, marker_line_width=1, opacity=1)
fig.add_trace(fig2['data'][0], row=1, col=2)
fig.add_trace(fig2['data'][1], row=1, col=2)

# Updates
fig.update_layout(boxmode='group')
fig.update_yaxes(range=[0, 20e3], row=1, col=1)
fig.update_yaxes(title_text='Average', range=[0, 12e3], row=1, col=2)
fig.update_layout(plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_layout(title={'text': 'Credit Amount Distribution by Purpose and Risk',
                         'x':0.46, 'y':0.9,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.show()
Average
Purpose Risk
Business Bad 5622.44
Good 3367.73
Car Bad 4221.95
Good 3559.97
Domestic Appliances Bad 1571.00
Good 1461.50
Education Bad 3151.87
Good 2705.00
Furniture/Equipment Bad 3645.50
Good 2794.20
Radio/TV Bad 2780.21
Good 2404.45
Repairs Bad 2791.12
Good 2692.07
Vacation/Others Bad 11566.00
Good 5811.71

Credit Amount Distribution by Risk and Gender

In [22]:
Temp = Data[['Sex','Credit Amount','Risk']]

fig = make_subplots(rows=1, cols=2, horizontal_spacing = 0.05, shared_yaxes=True, subplot_titles=('Good Risk', 'Bad Risk'))
# Left
fig1 = px.box(Temp.loc[Temp.Risk == 'Good'], x= 'Sex', y='Credit Amount', hover_data= Temp.columns,
              color='Sex', color_discrete_sequence= MF_Colors[::-1])
fig1.update_traces(quartilemethod='linear')
fig.add_trace(fig1['data'][0], row=1, col=1)
fig.add_trace(fig1['data'][1], row=1, col=1)
fig.update_traces(marker_line_color= GB_LC, marker_line_width=.2, opacity=1, showlegend = True, row=1, col=1)

# Right
fig2 = px.box(Temp.loc[Temp.Risk == 'Bad'], x= 'Sex', y='Credit Amount', hover_data= Temp.columns,
              color='Sex', color_discrete_sequence= MF_Colors)
fig2.update_traces(quartilemethod='linear')
fig.add_trace(fig2['data'][0], row=1, col=2)
fig.add_trace(fig2['data'][1], row=1, col=2)
fig.update_traces(marker_line_color= GB_LC, marker_line_width=.2, opacity=1, showlegend = False, row=1, col=2)

# Updates
fig.update_layout(boxmode='group', plot_bgcolor= 'white', width= 600)
fig.update_yaxes(range=[0, 20e3])
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')

fig.update_layout(title={'text': 'Credit Amount Distribution by Risk and Gender',
                         'x':0.46, 'y':0.9,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.show()

Duration Distribution by Risk

In [23]:
Temp = Data.groupby(['Duration','Risk'])['Risk'].agg({'count'}).rename(columns = {'count':'Count'})
Temp['Percentage'] = np.round(100* Temp.values /Temp.sum().values, 2)
Temp.reset_index(drop = False, inplace = True)

fig = px.bar(Temp, x= 'Duration', y= 'Count', color = 'Risk', hover_data= ['Risk', 'Count'],
             barmode='group',
             color_discrete_sequence= GB_Colors[::-1], height= 500)

fig.update_traces(marker_line_color= GB_LC, marker_line_width=0.5, opacity=1)
fig['layout']['yaxis'].update(range=[0, 140])
fig.update_traces(marker_line_color= 'Navy', marker_line_width=1.2, opacity=1)
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_layout(legend_orientation='h', plot_bgcolor= 'white')
fig.update_layout(title={'text': 'Duration Distribution by Risk',
                         'x':0.5, 'y':0.94,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.show()

fig = ff.create_distplot([Data.loc[Data.Risk == 'Good', 'Duration'],
                         Data.loc[Data.Risk == 'Bad', 'Duration']], ['Good', 'Bad'], colors= GB_Colors[::-1],
                         show_rug=False, bin_size= 1.5)
fig.update_traces(marker_line_color= GB_LC, marker_line_width=0.5, opacity= 0.7)
fig.update_layout(plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True, range=[0, 0.14])
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_layout(title={'text': 'Duration Distplot',
                         'x':0.5, 'y':0.90,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.show()

Credit Amount Distribution by Duration

In [24]:
fig = px.box(Data, x='Duration', y='Credit Amount', color='Risk', color_discrete_sequence= GB_Colors[::-1])
fig.update_traces(quartilemethod='linear')
fig.update_layout(plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True, range=[0, 20e3])
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_layout(title={'text': 'Credit Amount Distribution by Duration',
                         'x':0.46, 'y':0.95,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.show()